{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": "      Type Method             Regionname  Rooms  Distance  Postcode  Bedroom2  \\\n12167    u      S  Southern Metropolitan      1       5.0    3182.0       1.0   \n6524     h     SA   Western Metropolitan      2       8.0    3016.0       2.0   \n8413     h      S   Western Metropolitan      3      12.6    3020.0       3.0   \n2919     u     SP  Northern Metropolitan      3      13.0    3046.0       3.0   \n6043     h      S   Western Metropolitan      3      13.3    3020.0       3.0   \n\n       Bathroom  Landsize  Lattitude  Longtitude  Propertycount  \n12167       1.0       0.0  -37.85984    144.9867        13240.0  \n6524        2.0     193.0  -37.85800    144.9005         6380.0  \n8413        1.0     555.0  -37.79880    144.8220         3755.0  \n2919        1.0     265.0  -37.70830    144.9158         8870.0  \n6043        1.0     673.0  -37.76230    144.8272         4217.0  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Type</th>\n      <th>Method</th>\n      <th>Regionname</th>\n      <th>Rooms</th>\n      <th>Distance</th>\n      <th>Postcode</th>\n      <th>Bedroom2</th>\n      <th>Bathroom</th>\n      <th>Landsize</th>\n      <th>Lattitude</th>\n      <th>Longtitude</th>\n      <th>Propertycount</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>12167</th>\n      <td>u</td>\n      <td>S</td>\n      <td>Southern Metropolitan</td>\n      <td>1</td>\n      <td>5.0</td>\n      <td>3182.0</td>\n      <td>1.0</td>\n      <td>1.0</td>\n      <td>0.0</td>\n      <td>-37.85984</td>\n      <td>144.9867</td>\n      <td>13240.0</td>\n    </tr>\n    <tr>\n      <th>6524</th>\n      <td>h</td>\n      <td>SA</td>\n      <td>Western Metropolitan</td>\n      <td>2</td>\n      <td>8.0</td>\n      <td>3016.0</td>\n      <td>2.0</td>\n      <td>2.0</td>\n      <td>193.0</td>\n      <td>-37.85800</td>\n      <td>144.9005</td>\n      <td>6380.0</td>\n    </tr>\n    <tr>\n      <th>8413</th>\n      <td>h</td>\n      <td>S</td>\n      <td>Western Metropolitan</td>\n      <td>3</td>\n      <td>12.6</td>\n      <td>3020.0</td>\n      <td>3.0</td>\n      <td>1.0</td>\n      <td>555.0</td>\n      <td>-37.79880</td>\n      <td>144.8220</td>\n      <td>3755.0</td>\n    </tr>\n    <tr>\n      <th>2919</th>\n      <td>u</td>\n      <td>SP</td>\n      <td>Northern Metropolitan</td>\n      <td>3</td>\n      <td>13.0</td>\n      <td>3046.0</td>\n      <td>3.0</td>\n      <td>1.0</td>\n      <td>265.0</td>\n      <td>-37.70830</td>\n      <td>144.9158</td>\n      <td>8870.0</td>\n    </tr>\n    <tr>\n      <th>6043</th>\n  
    <td>h</td>\n      <td>S</td>\n      <td>Western Metropolitan</td>\n      <td>3</td>\n      <td>13.3</td>\n      <td>3020.0</td>\n      <td>3.0</td>\n      <td>1.0</td>\n      <td>673.0</td>\n      <td>-37.76230</td>\n      <td>144.8272</td>\n      <td>4217.0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Load the Melbourne housing snapshot (path is relative to the notebook's working directory).\n",
    "data = pd.read_csv('../input/melbourne-housing-snapshot/melb_data.csv')\n",
    "\n",
    "# Split the target column off from the feature columns.\n",
    "y = data.Price\n",
    "X = data.drop(['Price'], axis=1)\n",
    "\n",
    "# Hold out 20% of the rows for validation; random_state=0 makes the split repeatable.\n",
    "X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)\n",
    "\n",
    "# Drop every column that has a missing value in the training split, from both splits.\n",
    "# The frames are rebound instead of using drop(..., inplace=True): mutating the frames\n",
    "# returned by train_test_split in place is what pandas may flag with SettingWithCopyWarning.\n",
    "cols_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()]\n",
    "X_train_full = X_train_full.drop(columns=cols_with_missing)\n",
    "X_valid_full = X_valid_full.drop(columns=cols_with_missing)\n",
    "\n",
    "# Categorical columns: object dtype with fewer than 10 distinct values in the training split.\n",
    "low_cardinality_cols = [cname for cname in X_train_full.columns if\n",
    "                        X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == 'object']\n",
    "\n",
    "# Numerical columns: int64 or float64 dtype.\n",
    "numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]\n",
    "\n",
    "# Keep only the selected columns; copy so later edits do not touch the *_full frames.\n",
    "my_cols = low_cardinality_cols + numerical_cols\n",
    "X_train = X_train_full[my_cols].copy()\n",
    "X_valid = X_valid_full[my_cols].copy()\n",
    "\n",
    "X_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "outputs": [
    {
     "data": {
      "text/plain": "      Type Method             Regionname  Rooms  Distance  Postcode  Bedroom2  \\\n12167    u      S  Southern Metropolitan      1       5.0    3182.0       1.0   \n6524     h     SA   Western Metropolitan      2       8.0    3016.0       2.0   \n8413     h      S   Western Metropolitan      3      12.6    3020.0       3.0   \n2919     u     SP  Northern Metropolitan      3      13.0    3046.0       3.0   \n6043     h      S   Western Metropolitan      3      13.3    3020.0       3.0   \n\n       Bathroom  Landsize  Lattitude  Longtitude  Propertycount  \n12167       1.0       0.0  -37.85984    144.9867        13240.0  \n6524        2.0     193.0  -37.85800    144.9005         6380.0  \n8413        1.0     555.0  -37.79880    144.8220         3755.0  \n2919        1.0     265.0  -37.70830    144.9158         8870.0  \n6043        1.0     673.0  -37.76230    144.8272         4217.0  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Type</th>\n      <th>Method</th>\n      <th>Regionname</th>\n      <th>Rooms</th>\n      <th>Distance</th>\n      <th>Postcode</th>\n      <th>Bedroom2</th>\n      <th>Bathroom</th>\n      <th>Landsize</th>\n      <th>Lattitude</th>\n      <th>Longtitude</th>\n      <th>Propertycount</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>12167</th>\n      <td>u</td>\n      <td>S</td>\n      <td>Southern Metropolitan</td>\n      <td>1</td>\n      <td>5.0</td>\n      <td>3182.0</td>\n      <td>1.0</td>\n      <td>1.0</td>\n      <td>0.0</td>\n      <td>-37.85984</td>\n      <td>144.9867</td>\n      <td>13240.0</td>\n    </tr>\n    <tr>\n      <th>6524</th>\n      <td>h</td>\n      <td>SA</td>\n      <td>Western Metropolitan</td>\n      <td>2</td>\n      <td>8.0</td>\n      <td>3016.0</td>\n      <td>2.0</td>\n      <td>2.0</td>\n      <td>193.0</td>\n      <td>-37.85800</td>\n      <td>144.9005</td>\n      <td>6380.0</td>\n    </tr>\n    <tr>\n      <th>8413</th>\n      <td>h</td>\n      <td>S</td>\n      <td>Western Metropolitan</td>\n      <td>3</td>\n      <td>12.6</td>\n      <td>3020.0</td>\n      <td>3.0</td>\n      <td>1.0</td>\n      <td>555.0</td>\n      <td>-37.79880</td>\n      <td>144.8220</td>\n      <td>3755.0</td>\n    </tr>\n    <tr>\n      <th>2919</th>\n      <td>u</td>\n      <td>SP</td>\n      <td>Northern Metropolitan</td>\n      <td>3</td>\n      <td>13.0</td>\n      <td>3046.0</td>\n      <td>3.0</td>\n      <td>1.0</td>\n      <td>265.0</td>\n      <td>-37.70830</td>\n      <td>144.9158</td>\n      <td>8870.0</td>\n    </tr>\n    <tr>\n      <th>6043</th>\n  
    <td>h</td>\n      <td>S</td>\n      <td>Western Metropolitan</td>\n      <td>3</td>\n      <td>13.3</td>\n      <td>3020.0</td>\n      <td>3.0</td>\n      <td>1.0</td>\n      <td>673.0</td>\n      <td>-37.76230</td>\n      <td>144.8272</td>\n      <td>4217.0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the first rows of X_train.\n",
    "X_train.head()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "outputs": [],
   "source": [
    "# Boolean mask over the columns of X_train: True where the column dtype is object.\n",
    "is_object_col = X_train.dtypes == 'object'\n",
    "# Names of the object-dtype columns, in column order.\n",
    "object_cols = is_object_col[is_object_col].index.tolist()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Type', 'Method', 'Regionname']\n"
     ]
    }
   ],
   "source": [
    "# Print the names of the object-dtype columns collected in object_cols.\n",
    "print(object_cols)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "outputs": [],
   "source": [
    "def score_dataset(X_train, X_valid, y_train, y_valid, n_estimators=10, random_state=0):\n",
    "    \"\"\"Fit a random forest on the training data and return its validation MAE.\n",
    "\n",
    "    Args:\n",
    "        X_train: Training features passed to RandomForestRegressor.fit.\n",
    "        X_valid: Validation features passed to RandomForestRegressor.predict.\n",
    "        y_train: Training target values.\n",
    "        y_valid: Validation target values the predictions are compared against.\n",
    "        n_estimators: Number of trees in the forest. Defaults to 10, the value\n",
    "            that used to be hard-coded.\n",
    "        random_state: Seed forwarded to the forest. Defaults to 0, the value\n",
    "            that used to be hard-coded.\n",
    "\n",
    "    Returns:\n",
    "        The mean absolute error between y_valid and the model's predictions.\n",
    "    \"\"\"\n",
    "    # Imported lazily so this cell only needs scikit-learn when the function is called.\n",
    "    from sklearn.ensemble import RandomForestRegressor\n",
    "    from sklearn.metrics import mean_absolute_error\n",
    "\n",
    "    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)\n",
    "    model.fit(X_train, y_train)\n",
    "    preds = model.predict(X_valid)\n",
    "    return mean_absolute_error(y_valid, preds)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "方法1的误差：\n",
      "183550.22137772635\n"
     ]
    }
   ],
   "source": [
    "# Approach 1: discard the object-dtype columns and score what is left.\n",
    "drop_X_train, drop_X_valid = (frame.select_dtypes(exclude=['object']) for frame in (X_train, X_valid))\n",
    "\n",
    "print('方法1的误差：')\n",
    "drop_mae = score_dataset(drop_X_train, drop_X_valid, y_train, y_valid)\n",
    "print(drop_mae)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "第二种方法误差\n",
      "175062.2967599411\n"
     ]
    }
   ],
   "source": [
    "from sklearn.preprocessing import OrdinalEncoder\n",
    "\n",
    "# Approach 2: replace each category with an integer code.\n",
    "# Work on copies so X_train / X_valid keep their original string columns.\n",
    "label_X_train = X_train.copy()\n",
    "label_X_valid = X_valid.copy()\n",
    "\n",
    "# Categories that appear only in the validation split would make transform() raise;\n",
    "# encode them as -1 instead, mirroring handle_unknown='ignore' on the one-hot encoder\n",
    "# used later in this notebook.\n",
    "ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)\n",
    "# Fit on the training split only, then reuse the fitted mapping for validation.\n",
    "label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])\n",
    "label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])\n",
    "\n",
    "print('第二种方法误差')\n",
    "print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "方法3误差\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\98413\\miniconda3\\envs\\kaggle-learn-courses\\lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:868: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "176703.63810751104\n"
     ]
    }
   ],
   "source": [
    "from sklearn.preprocessing import OneHotEncoder\n",
    "\n",
    "# Approach 3: one-hot encode the categorical columns.\n",
    "# sparse_output=False returns a dense array; it replaces the `sparse` argument, which was\n",
    "# renamed in scikit-learn 1.2 and is removed in 1.4.\n",
    "# handle_unknown='ignore' leaves all indicator columns at 0 for categories not seen during fit.\n",
    "OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)\n",
    "\n",
    "# The encoder returns plain arrays, so the original row index is re-attached to let concat align rows.\n",
    "OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]), index=X_train.index)\n",
    "OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]), index=X_valid.index)\n",
    "\n",
    "# Remove the original categorical columns; the indicator columns take their place.\n",
    "num_X_train = X_train.drop(object_cols, axis=1)\n",
    "num_X_valid = X_valid.drop(object_cols, axis=1)\n",
    "\n",
    "OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)\n",
    "OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)\n",
    "\n",
    "# The indicator columns are labelled with integers; cast every label to str so the frame\n",
    "# does not mix string and integer column names.\n",
    "OH_X_train.columns = OH_X_train.columns.astype(str)\n",
    "OH_X_valid.columns = OH_X_valid.columns.astype(str)\n",
    "\n",
    "print('方法3误差')\n",
    "print(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))"
   ],
   "metadata": {
    "collapsed": false
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
